options(repos = "http://cran.us.r-project.org")

# Install (when missing) and attach every package the analysis relies on.
required_packages <- c(
  "ggpubr", "tidyverse", "ggplot2", "plotly", "moments", "DT", "LambertW"
)
for (package in required_packages) {
  # requireNamespace() only checks availability; library() then attaches the
  # package and stops with an error (rather than returning FALSE) if the
  # installation did not succeed.
  if (!requireNamespace(package, quietly = TRUE)) {
    install.packages(package)
  }
  library(package, character.only = TRUE)
}

# Keep only the 1962 observations for the GDP vs. CO2 comparison.
gapminder_filtered <- filter(gapminder, Year == 1962)
# Scatter plot of CO2 emissions per capita against GDP per capita.
gapminder_plot <- ggplot(
  gapminder_filtered,
  aes(x = gdpPercap, y = `CO2 emissions (metric tons per capita)`)
) +
  geom_point()
gapminder_plot
## Warning: Removed 151 rows containing missing values (`geom_point()`).

# Pearson correlation; use = "complete.obs" drops rows where either value is NA.
correlation <- cor(
  gapminder_filtered$`CO2 emissions (metric tons per capita)`,
  gapminder_filtered$gdpPercap,
  use = "complete.obs"
)
# Significance of that correlation.
p_value <- cor.test(
  gapminder_filtered$`CO2 emissions (metric tons per capita)`,
  gapminder_filtered$gdpPercap
)$p.value
print(paste("The correlation coefficient is", as.character(correlation)))
## [1] "The correlation coefficient is 0.926081672501945"
# The p-value was computed but never reported; print it so the output below
# is actually produced by this chunk.
print(paste("The p-value is", as.character(p_value)))
## [1] "The p-value is 1.1286792210055e-46"
We want to test if there is a statistically significant difference in the Energy use per continent. Thus, continent is the predictor variable, and energy use is the outcome variable.
To use a parametric test, we must ensure that three assumptions are met: Normality, equal variances, and independence.
Normality assumption: To check for normality, we plot a histogram of energy use for each continent.
# Histogram of energy use, one panel per continent. Free scales let every
# panel use its own axis ranges.
ggplot(
  gapminder,
  aes(x = `Energy use (kg of oil equivalent per capita)`)
) +
  geom_histogram(bins = 30) +
  facet_wrap(~ continent, scales = "free") +
  labs(
    x = "Energy use (kg of oil equivalent per capita)",
    y = "Frequency"
  )
## Warning: Removed 1197 rows containing non-finite values (`stat_bin()`).
As we can see, the data are not normally distributed. Therefore, we use the non-parametric Kruskal-Wallis test.
# Kruskal-Wallis rank sum test of energy use across continents.
# The vector (non-formula) interface has no `na.action` argument: anything
# passed under that name is silently ignored, and incomplete pairs are dropped
# by the test itself. The misleading argument is therefore omitted; the result
# is unchanged.
kruskal.test(
  gapminder$`Energy use (kg of oil equivalent per capita)`,
  gapminder$continent
)
##
## Kruskal-Wallis rank sum test
##
## data: gapminder$`Energy use (kg of oil equivalent per capita)` and gapminder$continent
## Kruskal-Wallis chi-squared = 318.68, df = 4, p-value < 2.2e-16
As we can see, the p-value is less than 2.2e-16, which is well below 0.05, so energy use differs significantly between at least two continents.
# Box plot of imports (% of GDP) for each continent, coloured by continent.
box_plot <- gapminder_years %>%
  ggplot(
    aes(
      x = continent,
      y = `Imports of goods and services (% of GDP)`,
      fill = continent
    )
  ) +
  geom_boxplot() +
  labs(
    title = "Box Plots of GDP Imports by Continent",
    x = "Continent",
    y = "Imports of goods and services (% of GDP)",
    fill = "Continent"
  )
# Render the static plot as an interactive plotly widget.
ggplotly(box_plot)
## Warning: Removed 12 rows containing non-finite values (`stat_boxplot()`).
# Overlaid density curves of imports (% of GDP), one per continent.
# alpha = 0.5 keeps overlapping curves visible.
density_plot <- gapminder_years %>%
  ggplot(
    aes(
      x = `Imports of goods and services (% of GDP)`,
      fill = continent
    )
  ) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density Plots of GDP Imports by Continent",
    x = "Imports of goods and services (% of GDP)",
    fill = "Continent"
  )
# Render the static plot as an interactive plotly widget.
ggplotly(density_plot)
## Warning: Removed 12 rows containing non-finite values (`stat_density()`).
Visually, the two continents’ imports of goods and services are very close, with overlapping peaks, although the variances appear to be different. There appear to be four outliers in Asia.
# Alias of the `gapminder_years` subset; no filtering is performed here.
# NOTE(review): presumably `gapminder_years` was already restricted to the
# years after 1990 where it was created - confirm.
data <- gapminder_years
# Q-Q plot of imports (% of GDP), one panel per continent, made interactive.
ggqqplot(
  data = gapminder_years,
  x = "`Imports of goods and services (% of GDP)`",
  facet.by = "continent"
) %>%
  ggplotly()
## Warning: Removed 12 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 12 rows containing non-finite values (`stat_qq_line()`).
## Removed 12 rows containing non-finite values (`stat_qq_line()`).
For Asia, a few points with high import values (% of GDP) lie above the diagonal reference line. Because normality is violated, a parametric test would not be appropriate, so we use the non-parametric Mann-Whitney-Wilcoxon test.
# Wilcoxon rank sum test comparing imports (% of GDP) between the continents.
# NOTE(review): the formula interface needs a grouping factor with exactly two
# levels; presumably `gapminder_years` holds only Europe and Asia - confirm.
result <- wilcox.test(
  `Imports of goods and services (% of GDP)` ~ continent,
  data = gapminder_years
)
print(result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Imports of goods and services (% of GDP) by continent
## W = 5707, p-value = 0.7867
## alternative hypothesis: true location shift is not equal to 0
As the p-value is greater than 0.05, we did not find a significant difference in ‘Imports of goods and services (% of GDP)’ between Europe and Asia.
'Population density (people per sq. km of land area)'
across all years? (i.e., which country has the highest average ranking
in this category across each time point in the dataset?)
# Calculate the mean population density by country, then extract the 5 countries with the greatest population density.
# na.rm = TRUE lets a country with some missing years still get a mean.
gapminder_pd <- gapminder %>%
  group_by(`Country Name`) %>%
  summarize(
    `Mean population density` = mean(
      `Population density (people per sq. km of land area)`,
      na.rm = TRUE
    )
  ) %>%
  arrange(desc(`Mean population density`)) %>%
  slice_head(n = 5)
datatable(gapminder_pd)

# Bar chart of the five retained countries; geom_col() draws bar heights
# straight from the y values.
mean_pd_plot <- gapminder_pd %>%
  ggplot(aes(x = `Country Name`, y = `Mean population density`)) +
  ggtitle("Mean population density of the top 5 countries") +
  geom_col()
ggplotly(mean_pd_plot)

# The table is sorted in descending order, so row 1 is the densest country.
gapminder_dense <- gapminder_pd %>% slice(1)
colnames(gapminder_dense)[1] <- "Country with the most population density"
datatable(gapminder_dense)
As seen from the bar chart, Macao SAR, China has the greatest population density across the years.
'Life expectancy at birth, total (years)' between 1962 and
2007?
# Get the top 5 countries with the greatest increase in life expectancy
# Change in life expectancy per country between 1962 and 2007.
gapminder_difference <- gapminder %>%
  filter(Year %in% c(1962, 2007)) %>%
  # Order chronologically so diff() yields (2007 value - 1962 value).
  # Ordering by the life-expectancy value itself would always give a
  # non-negative difference and report a decline as an increase.
  arrange(Year) %>%
  group_by(`Country Name`) %>%
  # diff() returns zero rows for a country with only one of the two years,
  # so reframe() simply leaves such countries out.
  reframe(
    `Difference in Life expectancy (2007 - 1962)` =
      diff(`Life expectancy at birth, total (years)`)
  ) %>%
  arrange(desc(`Difference in Life expectancy (2007 - 1962)`)) %>%
  slice_head(n = 5)

# Plotting the top 5 countries
gapminder_difference_plot <- gapminder_difference %>%
  ggplot(
    aes(
      x = `Country Name`,
      y = `Difference in Life expectancy (2007 - 1962)`
    )
  ) +
  geom_col() +
  labs(
    title = "Top 5 countries with the greatest increase in life expectancy from 1962 to 2007",
    y = "Difference in Life expectancy in years for (2007 - 1962)"
  )
ggplotly(gapminder_difference_plot)
As seen from the above bar chart, the country whose life expectancy increased the most from 1962 to 2007 is the Maldives.